package com.kdtech.analyse.news;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.kdtech.analyse.AnalyseNews;
import com.kdtech.analyse.JSoupUtils;
import com.kdtech.analyse.tool.ParseLogic;
import com.kdtech.analyse.tool.ParseTool;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.HtmlCleaner;

/**
 * Recognises and parses article ("detail") pages served from southcn.com
 * sub-domains, including the leaders.southcn.com question/answer pages.
 */
public class SouthcnNewsAnalyse implements AnalyseNews {

	/** URL prefix of the leaders.southcn.com pages, which use their own page layout. */
	private static final String LEADERS_PREFIX="http://leaders.southcn.com/";

	/** URL prefix of the jqhz.southcn.com sub-site. */
	private static final String JQHZ_PREFIX="http://jqhz.southcn.com/";

	/** Dash marker that may lead the title on leaders.southcn.com pages. */
	private static final String TITLE_DASH="——";

	/**
	 * URL shapes accepted as detail pages. Compiled once instead of on every call,
	 * and with every literal dot escaped so that '.' cannot match an arbitrary character.
	 */
	private static final Pattern[] DETAIL_PAGE_PATTERNS={
			//http://news.southcn.com/z/2012-04/13/content_43061548.htm
			Pattern.compile("http://.*[.]southcn[.]com/.*/[0-9]{4}-[0-9]{2}/[0-9]*/content_[0-9]*[.]htm"),
			//http://qydg.southcn.com/zhxw/201211/t20121123_338079.htm
			Pattern.compile("http://.*[.]southcn[.]com/.*/[0-9]{6}/t[0-9]{8}_[0-9]*[.]htm"),
			Pattern.compile("http://leaders[.]southcn[.]com/redetails[.]php\\?id=[0-9]*"),
			Pattern.compile("http://leaders[.]southcn[.]com/details[.]php\\?id=[0-9]*")
	};

	/**
	 * Tells whether the given URL has the shape of a southcn.com detail page.
	 *
	 * @param url the URL to test; may be {@code null}
	 * @return {@code true} if the whole URL matches one of the known detail-page
	 *         patterns, {@code false} otherwise (including for {@code null})
	 */
	public boolean isDetailPage(String url) {
		if (url == null) {
			return false;
		}
		for (Pattern pattern : DETAIL_PAGE_PATTERNS) {
			// matches() requires the entire URL to match, like String.matches did.
			if (pattern.matcher(url).matches()) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Extracts title, date, content and author from a crawled detail page.
	 *
	 * @param urlMeta the crawl result carrying the page URL and its HTML
	 * @return the populated {@link NewsMeta}, or {@code null} when {@code urlMeta}
	 *         is {@code null}, its URL is not a detail page, or it has no HTML
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		if (urlMeta == null) {
			return null;
		}
		String url=urlMeta.getUrl();
		// These two guards had empty bodies before, so unusable input fell through
		// to the parser; a missing HTML body would then fail inside Jsoup.parse.
		if (!isDetailPage(url)) {
			return null;
		}
		String html=urlMeta.getHtml();
		if (html == null) {
			return null;
		}

		// NOTE(review): the source label is computed but never stored on the result;
		// no NewsMeta setter for it is visible from this file - confirm whether one
		// should be called here.
		String source="南方网";
		if (url.startsWith(JQHZ_PREFIX)) {
			source="崛起惠州";
		} else if (url.startsWith(LEADERS_PREFIX)) {
			source="广东省网络问政平台";
		}

		NewsMeta meta=new NewsMeta();
		meta.setUrl(url);
		meta.setType(0);
		Document doc=Jsoup.parse(html);

		if (url.startsWith(LEADERS_PREFIX)) {
			fillFromLeadersPage(doc, url, meta);
		} else {
			fillFromArticlePage(doc, url, meta);
		}
		// Click and comment counts are not extracted for this site; both stay at 0.
		meta.setClickNum(0);
		meta.setCommentNum(0);

		String author=JSoupUtils.matchAuthor(doc, "来源：");
		meta.setAuthor(author);
		return meta;
	}

	/** Fills title, date and content from the leaders.southcn.com page layout. */
	private void fillFromLeadersPage(Document doc, String url, NewsMeta meta) {
		String title=doc.select("html body div.GcW div.Gp1 ul.GsTAL5 li h3 font[color=#000000]").text();
		// Strip only the leading dash marker; dashes inside the title are kept.
		while (title.startsWith(TITLE_DASH)) {
			title=title.substring(TITLE_DASH.length());
		}
		if (StringUtils.isBlank(title)) {
			title=doc.select("html body div.GcW div.Gp1 div.pos a").text();
		}

		Long date=DateUtils.matchDate(doc.select("html body div.GcW div.Gp1 ul.GsTAL5 li h3 font").text());
		if (date == null) {
			date=DateUtils.matchDate(doc.select("html body div.GcW div.Gp1 div.pos:contains(领导回应)").text());
		}

		String content=HtmlCleaner.getContentHtml(url, doc.select("html body div.GcW div.Gp1 ul.GsTAL5 li p"));

		meta.setTitle(title);
		meta.setContent(content);
		meta.setDate(date);
	}

	/**
	 * Fills title, date and content from the regular southcn.com article layouts.
	 * The same ParseTool is reused for all three fields; presumably each parse call
	 * consumes the selectors added since the previous one - confirm in ParseTool.
	 */
	private void fillFromArticlePage(Document doc, String url, NewsMeta meta) {
		ParseTool tool=new ParseTool(doc);

		// Parse the news title.
		tool.addParseLogic(new ParseLogic("#article_title"));
		tool.addParseLogic(new ParseLogic("div#ScDetailTitle"));
		tool.addParseLogic(new ParseLogic("div.bigtitle"));
		tool.addParseLogic(new ParseLogic("span#ScDetailTitle"));
		String title=tool.parse();

		// Parse the publication time, falling back to a date found in the URL.
		tool.addParseLogic(new ParseLogic("span.pub_time"));
		tool.addParseLogic(new ParseLogic("span.time"));
		tool.addParseLogic(new ParseLogic("div.contime"));
		tool.addParseLogic(new ParseLogic("div.tips"));
		tool.addParseLogic(new ParseLogic("div.desc"));
		Long date=tool.parseDate();
		if (date == null) {
			date=DateUtils.matchDate(url);
		}

		// Parse the article content.
		tool.addParseLogic(new ParseLogic("div#ScDetailContent"));
		tool.addParseLogic(new ParseLogic("div.content"));
		tool.addParseLogic(new ParseLogic("div.c_content"));
		String content=tool.parse();

		meta.setTitle(title);
		meta.setContent(content);
		meta.setDate(date);
	}

	/**
	 * Manual smoke test: checks a sample URL against the detail-page patterns and,
	 * if it matches, crawls and parses it, printing the result.
	 */
	public static void main(String[] args) {
		String url="http://kb.southcn.com/content/2015-04/08/content_121776082.htm";
		SouthcnNewsAnalyse analyser=new SouthcnNewsAnalyse();
		boolean detailPage=analyser.isDetailPage(url);
		System.out.println(detailPage);
		if (!detailPage) {
			System.out.println("不符合正则");
			return;
		}
		// Fetch the page only once the URL is known to be parseable.
		UrlMeta urlMeta=CrawlHTML.responseToURL(url);
		NewsMeta parsed=analyser.parserHtml(urlMeta);
		System.out.println(parsed);
	}

	/**
	 * Not supported for this site: almost nobody comments on these pages, so
	 * nothing is re-parsed.
	 *
	 * @param meta the previously parsed article (ignored)
	 * @return always {@code null}
	 */
	public NewsMeta Update(NewsMeta meta) {
		return null;
	}

	/**
	 * @return always {@code false}; articles from this site are never refreshed
	 */
	public boolean isNeedUpdate(){
		return false;
	}
}
